In [1]:
import pickle
In [3]:
# Read the pickled results from disk. The following cell calls .replace() on
# every entry, so each entry is expected to be a string.
# NOTE: pickle.load can execute arbitrary code; only open files you trust.
with open('result_list.pickle', 'rb') as result_file:
    data = pickle.load(result_file)
In [4]:
# Remove the fixed label text from every result line and split what is left
# on whitespace. Each entry of clean_d is therefore a list of string tokens;
# downstream code parses token 0 as a float and token 1 as an int.
clean_d = [
    line.replace('Within Set Sum of Squared Error = ', '')
        .replace('for k =', '')
        .split()
    for line in data
]
    
In [5]:
from matplotlib import pyplot as plt
In [6]:
# x: second token of every cleaned line, parsed as int (the k value).
# y: first token of every cleaned line, parsed as float (the error value).
x = [int(tokens[1]) for tokens in clean_d]
y = [float(tokens[0]) for tokens in clean_d]
In [7]:
# Sanity check: smallest k value found in the parsed results.
min(x)
Out[7]:
2
In [10]:
# Plot the parsed error values (y) against k (x).
fig, ax = plt.subplots()

ax.grid(True)
ax.plot(x, y)
# Clamp both axes to the observed data range. This is done on `ax` directly
# instead of through pyplot's implicit "current axes" state.
ax.set_xlim(min(x), max(x))
ax.set_ylim(min(y), max(y))
# Label the figure so it can be read on its own; the wording is taken from
# the text of the result lines that x and y were parsed from.
ax.set_xlabel('k')
ax.set_ylabel('Within Set Sum of Squared Error')
ax.set_title('Within Set Sum of Squared Error vs. k')
plt.show()
In [9]:
from pyspark.mllib.clustering import KMeans, KMeansModel
In [11]:
# Load a previously saved KMeansModel from HDFS.
# NOTE(review): `sc` is never defined in this notebook; presumably it is the
# SparkContext injected by the PySpark kernel/shell -- confirm before re-running.
# The path suffix "k_17" suggests a 17-cluster model, which matches the 17
# cluster ids (0-16) in the recorded outputs further down.
sameModel = KMeansModel.load(sc, "hdfs:///user/cluster_model/cluster_model_k_17")
In [13]:
# Read the pickled key -> array mapping (later cells call .keys() on it and
# .reshape() on its values).
# This rebinds `data`, which up to this point held the result lines.
# NOTE: pickle.load can execute arbitrary code; only open files you trust.
with open('success_vec_protocol_1.pickle', 'rb') as vector_file:
    data = pickle.load(vector_file)
In [14]:
# Distribute only the keys of `data`; the arrays themselves are looked up
# from `data` inside the map in the next cell.
keys_rdd = sc.parallelize(list(data.keys()))
In [15]:
# Pair every key with its array flattened to a 1-D vector of length 4096.
# The lambda closes over the driver-side `data` object.
key_value_rdd = keys_rdd.map(
    lambda name: (name, data[name].reshape((4096,)))
)
In [16]:
# Keep only the vectors, dropping the keys.
# The pair is taken as a single `kv` argument and indexed: tuple-unpacking
# lambda parameters (`lambda (k, v): ...`) are Python 2-only syntax and a
# SyntaxError on Python 3 (PEP 3113). This form runs on both.
array_rdd = key_value_rdd.map(lambda kv: kv[1])
In [17]:
from scipy.spatial.distance import euclidean
In [18]:
def _cluster_and_distance(point):
    """Return ``(cluster_id, distance)`` for one vector.

    ``cluster_id`` is the cluster ``sameModel`` predicts for ``point`` and
    ``distance`` is the Euclidean distance from ``point`` to that cluster's
    centre. The prediction is computed once and reused for the centre lookup
    instead of calling ``predict`` twice per point.
    """
    cluster_id = sameModel.predict(point)
    return (cluster_id, euclidean(point, sameModel.centers[cluster_id]))


# RDD of (cluster id, distance to that cluster's centre), one pair per vector.
distance = array_rdd.map(_cluster_and_distance)
In [19]:
# For every cluster id keep the largest point-to-centre distance, i.e. the
# distance of the farthest member from the centre.
radius = distance.reduceByKey(max)
In [20]:
# Bring the per-cluster maximum distances back to the driver for display.
# NOTE(review): in the recorded output cluster 11 has a maximum distance of
# 0.0, meaning every point assigned to it coincides with its centre. That is
# surprising and worth investigating (identical vectors? -- unconfirmed).
radius.collect()
Out[20]:
[(0, 384.53989663911057),
 (8, 350.2250297052688),
 (4, 286.99718308966584),
 (12, 366.1748040598126),
 (16, 260.2077201582386),
 (1, 303.509521851893),
 (13, 304.6827525488093),
 (5, 333.40630591665126),
 (9, 395.7292136978472),
 (2, 375.49031747982946),
 (14, 430.0844557188018),
 (10, 255.91134005287276),
 (6, 295.8650964110999),
 (11, 0.0),
 (15, 334.59170424230433),
 (3, 272.44843825318327),
 (7, 319.45044279866215)]
In [21]:
# Pair every vector with the id of the cluster the model assigns it to.
rd_hash = array_rdd.map(lambda point :(sameModel.predict(point),point))
In [22]:
# Gather all vectors that share a cluster id under that id.
rd_group = rd_hash.groupByKey()
In [23]:
# Pull every (cluster id, vectors) group back to the driver. Note that this
# transfers all vectors, not just the counts printed in the next cell.
group = rd_group.collect()
In [24]:
# Print (cluster id, member count) for every cluster.
# The pair is wrapped in an explicit tuple so the output is the tuple repr,
# e.g. "(0, 579)", whether `print` is the Python 2 statement or the Python 3
# function. A bare `print (a, b)` only prints a tuple on Python 2.
for cluster_id, members in group:
    print((cluster_id, len(list(members))))
(0, 579)
(8, 440)
(4, 428)
(12, 494)
(16, 18)
(1, 934)
(13, 403)
(5, 512)
(9, 536)
(2, 891)
(14, 991)
(10, 355)
(6, 1048)
(11, 542)
(15, 502)
(3, 424)
(7, 794)
In [47]:
# Materialise the member vectors of cluster 16 as a plain list.
# `d` is only assigned when a group with id 16 exists.
for cluster_id, members in group:
    if cluster_id == 16:
        d = list(members)
In [48]:
# Reshape every vector in d to a (1, 4096) row, updating the list in place
# (slice assignment keeps the same list object).
d[:] = [vec.reshape((1, 4096)) for vec in d]
In [49]:
import random
In [51]:
# Alias, not a copy: test_d and d refer to the same list object.
test_d = d
In [200]:
#test_d = d
In [52]:
# Same (key, vector) pairs as key_value_rdd, with every vector reshaped to a
# (1, 4096) row so it can be compared with the driver-side vectors in `d`.
# The pair is taken as a single `kv` argument: tuple-unpacking lambda
# parameters (`lambda (k, v): ...`) are a SyntaxError on Python 3 (PEP 3113).
reverse_hash = key_value_rdd.map(lambda kv: (kv[0], kv[1].reshape((1, 4096))))
In [53]:
# Fetch a single (key, vector) record to inspect its layout.
t = reverse_hash.take(1)
In [54]:
# Shape of the vector in the sampled record.
t[0][1].shape
Out[54]:
(1, 4096)
In [55]:
# Shape of the first driver-side vector. It has to match the RDD-side shape,
# because np.array_equal (used in fil) is False for arrays of different shape.
d[0].shape
Out[55]:
(1, 4096)
In [56]:
import numpy as np
In [57]:
# Broadcast the list of target vectors so that fil() can read it on the
# executors through brod.value.
brod = sc.broadcast(test_d)
In [58]:
def fil(k,v):
    """Return ``k`` if ``v`` equals one of the broadcast arrays, else ``None``.

    ``v`` is compared with ``np.array_equal`` against each array in
    ``brod.value``; the scan stops at the first match. The result is used as
    a filter predicate, so a returned key counts as "keep" only when the key
    itself is truthy.
    """
    matched = any(np.array_equal(candidate, v) for candidate in brod.value)
    return k if matched else None
In [59]:
# Keep only the (key, vector) records whose vector matches one of the
# broadcast target vectors, and collect them on the driver.
# The pair is taken as a single `kv` argument: tuple-unpacking lambda
# parameters (`lambda (k, v): ...`) are a SyntaxError on Python 3 (PEP 3113).
subset = reverse_hash.filter(lambda kv: fil(kv[0], kv[1])).collect()
In [60]:
# Number of matching records; the recorded value (18) equals the member count
# printed for cluster 16 above.
len(subset)
Out[60]:
18
In [41]:
# Key of the first matching record.
# NOTE(review): this cell's execution count (41) is lower than that of the
# cell building `subset` (59), and the recorded key is not the first file name
# printed by the display loop below, so this output looks stale -- re-run.
subset[0][0]
Out[41]:
u'cdn-3.eneighborhoods.com_x2_@v=545746007@_103_0_027_1515027_1515027_16.jpg'
In [42]:
# Directory that holds the image files; the keys in `subset` are used as file
# names inside it. Hard-coded absolute path -- adjust for your environment.
img_dir = '/mnt/homes_img'
In [ ]:
 
In [43]:
%matplotlib inline

import matplotlib.pyplot as plt
from scipy.ndimage import imread
In [61]:
# Display every image in `subset`, printing its file name first.
# The file is read with plt.imread rather than scipy.ndimage.imread: the SciPy
# function was deprecated in SciPy 1.0 and removed in 1.2, while pyplot's
# reader is available wherever matplotlib is.
for name, _vector in subset:
    image = plt.imread(img_dir + "/" + name)
    print(name)
    plt.imshow(image)
    plt.show()
cdn.homes.com_cgi-bin_readimage_e0c5049c9f47282d86ef812f785f2f86_664-new-mayo-dr-roxboro-nc-27574-14.jpg
cdn.homes.com_cgi-bin_readimage_12d82ad35b2569f02165dc0bac66d639_50-millie-park-jackson-tn-38305-16.jpg
cdn.homes.com_cgi-bin_readimage_5794706560_43190-heritage-dr-leonardtown-md-20650-23.jpg
cdn.homes.com_cgi-bin_readimage_5805927152_12196-linda-flora-dr-ojai-ca-93023-11.jpg
cdn-2.eneighborhoods.com_x2_@v=1916507217@_2_1_141_o5511141_o5511141_14.jpg
cdn-3.eneighborhoods.com_x2_@v=-763375325@_103_6_634_1711634_1711634_2.jpg
cdn.homes.com_cgi-bin_readimage_38fca501110219f5319c79783b8764ea_2520-se-nottingham-drive-lees-summit-mo-64063-9.jpg
cdn.homes.com_cgi-bin_readimage_6e5c9f3ff5c0ab0e801f14fcdacbed4a_119-woodland-rd-johnson-city-tn-37601-19.jpg
cdn-2.eneighborhoods.com_x2_@v=32249640@_122_0_070_5620070_5620070_12.jpg
cdn.homes.com_cgi-bin_readimage_da1ce22fc88bcae3605854ab55344aa0_7749-basswood-dr-chattanooga-tn-37416-11.jpg
cdn.homes.com_cgi-bin_readimage_d3bb52114bb13b5909c52a4992bc8223_18-sundance-trl-pelzer-sc-29669-20.jpg
cdn.homes.com_cgi-bin_readimage_9cc7c508f7f109151665795a5889cf30_14161-hickory-rise-ct-roscoe-il-61073-17.jpg
cdn.homes.com_cgi-bin_readimage_e96c87f58a67b4636bb6bf13c2f94d7a_9116-ore-bank-rd-port-republic-va-24471-21.jpg
cdn-8.eneighborhoods.com_x2_@v=983448127@_2558_8_872_20172872_20172872_26.jpg
cdn.homes.com_cgi-bin_readimage_f31559f2e75c0912a0f58a79e899e0b2_1594-meadow-creek-church-rd-locust-nc-28097-9.jpg
cdn-2.eneighborhoods.com_x2_@v=12633872@_122_4_467_5596467_5596467_15.jpg
cdn.homes.com_cgi-bin_readimage_71378f9ab91937a065f9fad4170d8212_681-n-beaverdam-dr-florence-sc-29501-18.jpg
cdn.homes.com_cgi-bin_readimage_82c5981a7331f161005e37fa8f4a2703_5780-stoneridge-dr-fairview-pa-16415-9.jpg
In [ ]: